Data preparation

Load packages

library(tidyverse)
library(tidytext)
library(igraph)
library(ggraph)
library(stringr)
library(widyr)
library(knitr)
library(topicmodels)

Get data.

dat <- read_csv("data/final_corpus.csv") %>%
  filter(!is.na(title))

Make a data frame that lists all the authors in separate columns for each paper.

author_df <- dat %>% select(id, authors) %>% 
  unnest_tokens(output = authors_long, input = authors, token = stringr::str_split, pattern = ";")

Convert each author name to “last, first initial” format.

author_df <- author_df %>% 
  mutate(authors_long = str_trim(authors_long)) %>%
  mutate(last = ifelse(grepl(",", authors_long) == TRUE,
    str_extract(authors_long, "[^,]*"),
    str_extract(authors_long, "[^ ]*$"))) %>%
  mutate(first_init = ifelse(grepl(",", authors_long) == TRUE,
                             strsplit(authors_long, " "),
                             str_sub(authors_long, start = 1, end = 1))) %>%
  unnest(first_init) %>%
  group_by(authors_long, id) %>%
  slice(2) %>%
  ungroup() %>%
  arrange(id) %>%
  mutate(first_init = str_sub(first_init, 1, 1)) %>%
  mutate(author = paste0(first_init, ". ", last)) %>%
  select(-last, -first_init)

kable(head(author_df))
id authors_long author
17 cannon, ellie e. cannon
17 copenhaver-parry, paige e. p. copenhaver-parry
18 betts, matthew g. m. betts
18 frey, sarah j. k. s. frey
18 hadley, adam s. a. hadley
20 barnhart, theodore b. t. barnhart
write_csv(author_df, "data/authors.csv")

Now column “author” contains the most standard version. It looks like we have 1966 unique authors.

Get pairwise author count:

author_pairs <- author_df %>%
  pairwise_count(author, id, sort = TRUE, upper = FALSE)
names(author_pairs)[1:2] <- c("author1", "author2")
kable(head(author_pairs))
author1 author2 n
w. romme m. turner 9
j. bradford w. lauenroth 6
a. hamlet d. lettenmaier 6
l. leung y. qian 6
d. horan d. isaak 5
d. isaak d. nagel 5

Make a yearly version of author pairs as well:

author_df <- full_join(author_df, dat[, c("id", "year")], by = "id")
author_df <- author_df %>% filter(!is.na(year))

years <- unique(author_df$year)
pair_dfs <- vector("list", length(years))
for(i in 1:length(years)){
  df <- author_df %>% filter(year == years[i])
  pair_dfs[[i]] <- df %>% 
    pairwise_count(author, id, sort = TRUE, upper = FALSE) %>%
    mutate(year = years[i])
}

author_pairs_years <- bind_rows(pair_dfs)
names(author_pairs_years)[1:2] <- c("author1", "author2")

#Subset to years with > 50 author collaborations in our dataset.
#(save a copy with the full dataset for later).
apy_full <- author_pairs_years
years <- author_pairs_years %>% group_by(year) %>% count() %>% filter(nn > 50)
author_pairs_years <- author_pairs_years %>% filter(year %in% years$year)

Network analysis:

Simple network of authors.

Plot network of author collaborations in cases where there are 2 or more collaborations.

set.seed(1234)
author_pairs%>%
  filter(n > 2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "kk") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan4") +
  geom_node_point(size = 2) +
  geom_node_text(aes(label = name), repel = TRUE, 
                 point.padding = unit(0.2, "lines"),
                 size = 2) +
  theme_void()

Looks like this tells us that most of the researchers in this network are operating as isolated small groups, rather than having strong inter-group collaboration - although, collaborations with less than 3 counts were left out in the interest of computational efficiency, so that may be hiding some interesting collaboration structure. Go ahead and make the full network (don’t include text; there will be 1897 authors - which also means that there are 69 authors who have only sole-authored items). This takes ~5 minutes.

set.seed(1234)
author_pairs%>%
  graph_from_data_frame() %>%
  ggraph(layout = "kk") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan4") +
  geom_node_point(size = 1) +
  geom_node_text(aes(label = name), repel = FALSE, 
                 point.padding = unit(0.2, "lines"),
                 size = 2) +
  theme_void()
## Warning: Ignoring unknown parameters: point.padding

Wow, that makes it look like there are a bunch of really central authors that connect to more peripheral ones. How do other network layouts affect this?

set.seed(1234)
author_pairs%>%
  graph_from_data_frame() %>%
  ggraph(layout = "drl") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan4") +
  geom_node_point(size = 0.1) +
  #geom_node_text(aes(label = name), repel = FALSE, 
  #               point.padding = unit(0.2, "lines"),
  #               size = 2) +
  theme_void()

How about a hive plot? This would solve the problem that different layouts suggest different information about the network. There are probably a number of ways we could structure a hive plot - but one would be to have three axes, representing physical sciences, social sciences, and life sciences - and then each axis would be doubled, so we could see collaborations within the axis, and collaborations across disciplines.

graph <- graph_from_data_frame(author_pairs, directed = FALSE)
V(graph)$friends <- degree(graph, mode = 'all')
V(graph)$friends <- ifelse(V(graph)$friends < 3, 'few', 
                           ifelse(V(graph)$friends >= 10, 'many', 'medium'))
ggraph(graph, 'hive', axis = 'friends') + 
    geom_edge_hive(aes(edge_alpha = n)) + 
    geom_axis_hive(aes(colour = friends), 
                   alpha = 1, size = 2, label = FALSE) + 
    coord_fixed() +
    labs(color = "Collaborators") +
    theme_void()

Now, try a graph that facets the networks by year. First, do this wtih only author pairs with greater than 1 collaboration - this may not be the most interesting piece of information, but it’s a little easier to read than a version with all the authors included. It does look like we might be moving towards more networks, and possibly more interconnectedness of networks over time? There are really dense clumps in 2008-2010, and a greater number of clumps with more interaction between them in more recent years? However, this interpretation depends strongly on what graphical layout we use; this would be a good thing to research more.

How about if we do this with all collaborations included?

Descriptive statistics

Actor-level statistics

Finally, let’s calculate some descriptive statistics on actors in this network - this is just a start, and a place we’ll need to come back to. First, eigenvector centrality:

graph_obj <- graph_from_data_frame(author_pairs, directed = FALSE)
ec <- eigen_centrality(graph_obj)[[1]]
ec_df <- data.frame(ec) 
ec_df <- ec_df %>% mutate(name = rownames(ec_df)) %>%
  arrange(desc(ec))
names(ec_df[1]) <- "eigen_centrality"

kable(ec_df[1:20,])
ec name
1.0000000 p. morgan
0.9705727 j. abatzoglou
0.9096055 t. link
0.8936240 a. meddens
0.8901255 k. vierling
0.8894930 t. hall
0.8894930 p. klos
0.8894930 k. kemp
0.8894930 j. blades
0.8873678 j. holbrook
0.8862137 m. clark
0.8833284 r. niemeyer
0.8753721 a. haruch
0.8753721 l. mitchell
0.8753721 m. dodd
0.8753721 b. soderquist
0.8753721 a. bean
0.8753721 t. magney
0.8753721 c. walsh
0.8753721 v. jansen

Wow, the dominance of UI people in this list is actually sort of disturbing…? Eigenvector centrality can be thought of as “the sum of an actor’s connections to other actors, weighted by those other actors’ degree centrality” (Bodin and Prell 2011, originally Borgatti, 1995).

Let’s calculate some other metrics so we have a good summary.

bet <- betweenness(graph_obj) %>%
  data.frame() 
bet <- bet %>% mutate(name = rownames(bet))
names(bet)[1] <- "betweenness"

deg <- degree(graph_obj) %>% 
  data.frame()
deg <- deg %>% mutate(name = rownames(deg))
names(deg)[1] <- "degree_centrality"

actor_stats <- full_join(ec_df, bet, by = "name") %>%
  full_join(deg, by = "name") %>%
  select(name, everything()) %>%
  arrange(desc(degree_centrality)) 

kable(actor_stats[1:20,])
name ec betweenness degree_centrality
j. abatzoglou 0.9705727 13072.2891 64
j. hicke 0.1279697 1721.2890 58
p. morgan 1.0000000 4036.3391 55
b. bentz 0.0275886 0.0000 45
j. littell 0.0329091 8658.4189 43
d. peterson 0.0234773 801.9984 41
t. link 0.9096055 1018.7381 38
t. spies 0.1402119 1578.8000 38
s. stephens 0.1426109 852.1667 37
r. keane 0.1397720 2433.5167 37
c. miller 0.1859048 4946.4762 36
l. graumlich 0.0103135 3209.3167 36
k. raffa 0.0650124 9417.9923 35
c. tague 0.0340981 1244.3262 35
m. clark 0.8862137 2720.6060 32
c. millar 0.0043995 2340.0000 32
a. meddens 0.8936240 1958.3473 31
k. vierling 0.8901255 328.0000 31
d. fagre 0.0068962 1975.8599 31
r. moore 0.0010828 599.0000 31

The order of actors changes substantially depending on what metric is used, which is kind of interesting to note (although frankly, J Abatzoglou is pretty much killing it no matter which metric we use). This analysis becomes even more interesting (in my opinion) if we can pair it with more information about these authors: their disciplines, genders, and institutions.

Network level statistics

Calculating some network-level statistics will let us summarize network characteristics and look at changes over time.

centr <- centr_degree(graph_obj)
centr$centralization
## [1] 0.01499632
centr$theoretical_max
## [1] 7189632

Wow, that is very low centralization relative to the theoretical maximum. How does centralization change over time?

years <- unique(apy_full$year)

centr_time <- data.frame(year = years, centralization = NA, tmax = NA)
for(i in 1:length(years)){
  df <- apy_full %>% filter(year == years[i])
  graph_obj <- graph_from_data_frame(df)
  x <- centr_degree(graph_obj)
  centr_time$centralization[i] <- x$centralization
  centr_time$tmax[i] <- x$theoretical_max
}

#Divide centralization/tmax to see what fraction of max is achieved.
centr_time <- centr_time %>% mutate(f = centralization/tmax)

#Plot
plot_dat <- data.table::melt(centr_time, id.vars = "year")

p <- ggplot(plot_dat, aes(x = year, y = value)) + 
  geom_point() +
  geom_smooth() +
  facet_wrap(~variable, scales = "free", dir = "h")
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Plot with log transformation:
p <- p + scale_y_continuous(trans = "log")
p
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).

I think the first and third plots are most relevant here: the first is the network centralization overall, and the third (“f”) is the fraction of the theoretical maximum centralization that was achieved. When we look at these plots without any kind of transformation, there’s not much to see - but when they’re transformed on a log scale, it looks like there might be an interesting pattern in degree centralization. One caution though is not to over-interpret this until we absolutely have our final corpus.

Another idea for descriptive statistics is to calculate some subgroup statistics??

Topic modeling

Join authors with subject data (using journal, discipline, keyword?). A possible approach here is to do some topic modeling with the keywords (276 papers are missing keyword data), categorize papers by topic, and then do network analysis grouped by topic. This is probably a good idea anyway because we want to split papers by topic for when we code for content.

keyword_df <- dat %>%
  dplyr::select(id, keywords) %>%
  mutate(keywords = gsub(",", ";", keywords)) %>%
  unnest_tokens(input = keywords, output = keywords, token = stringr::str_split, pattern = ";") %>%
  mutate(keywords = str_trim(keywords)) %>%
  filter(!is.na(keywords))

We’ve got 2136 unique keywords. Let’s look at keyword pairs to see how they’re grouped.

keyword_pairs <- keyword_df %>%
  pairwise_count(keywords, id, sort = TRUE, upper = FALSE)
set.seed(1234)
keyword_pairs %>%
  filter(n >= 10) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "salmon") +
  geom_node_point(size = 2) +
  geom_node_text(aes(label = name), repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void()

Wow, that keyword network seems to be very centralized. And I worry a little bit that the structure we see here is just an artifact of how people happen to decide on topics.

keyword_cors <- keyword_df %>% 
  group_by(keywords) %>%
  #filter(n() >= 50) %>%
  pairwise_cor(keywords, id, sort = TRUE, upper = FALSE)

#Take out keyword correlations with correlation 1; these are redundant.
keyword_cors <- keyword_cors %>%
  filter(round(correlation, 3) < 1)

kable(head(keyword_cors))
item1 item2 correlation
plant pathogenic fungi plant pathogens 0.9347054
fungal diseases plant pathogenic fungi 0.9251195
plant pests insect pests 0.9135562
aquatic plants phytoplankton 0.9121812
air pollutants air pollution 0.9121812
arthropod pests pests 0.9031583

Visualize the network of keyword correlations:

set.seed(1234)
keyword_cors %>%
  filter(correlation > .7) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "kk") +
  geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), edge_colour = "darkorchid") +
  geom_node_point(size = 1) +
 # geom_node_text(aes(label = name), repel = FALSE,
                 #point.padding = unit(0.2, "lines"),
  #               size = 2) +
  theme_void()

Well, that’s extremely difficult to read, but it does look like there’s some more meaningful grouping with correlations than there was with just pairwise analysis.

Let’s move on to some topic modeling to see if we can group papers. First, define stop words in addition to the common ones, and get word counts.

my_stop_words <- data_frame(word = c("climate change", "usa"),
                                      lexicon = rep("custom", 2))

word_counts <- keyword_df %>%
  rename(word = keywords) %>%
  anti_join(my_stop_words) %>%
  count(id, word, sort = TRUE) %>%
  ungroup() %>%
  arrange(-n)
## Joining, by = "word"
word_counts
## # A tibble: 6,461 x 3
##       id                      word     n
##    <int>                     <chr> <int>
##  1  1194         dissolved organic     2
##  2    17            bayesian model     1
##  3    17         distribution edge     1
##  4    17        distribution shift     1
##  5    17         plant performance     1
##  6    17      sensitivity analysis     1
##  7    18               composition     1
##  8    18  dynamic occupancy models     1
##  9    18 forest bird distributions     1
## 10    18      forest structure and     1
## # ... with 6,451 more rows

This topic modeling approach may not make a lot of sense since keywords generally only appear once… but continue anyway.

keyword_dtm <- word_counts %>%
  cast_dtm(id, word, n)

keyword_lda <- LDA(keyword_dtm, k = 8, control = list(seed = 1234))
tidy_lda <- tidy(keyword_lda)

top_terms <- tidy_lda %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  group_by(topic, term) %>%    
  arrange(desc(beta)) %>%  
  ungroup() %>%
  mutate(term = factor(paste(term, topic, sep = "__"), 
                       levels = rev(paste(term, topic, sep = "__")))) %>%
  ggplot(aes(term, beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
  labs(title = "Top 10 terms in each LDA topic",
       x = NULL, y = expression(beta)) +
  facet_wrap(~ topic, ncol = 4, scales = "free")

These look like they’re somewhat informative, but imperfect. They would probably be more informative if we did this same procedure with full texts or abstracts (which might be a good next step, using crminer).

For now, can we categorize each paper by its topic?

lda_gamma <- tidy(keyword_lda, matrix = "gamma")
id_topic <- lda_gamma %>% 
  group_by(document) %>%
  filter(gamma == max(gamma)) %>%
  ungroup() %>%
  select(document, topic) %>%
  rename(id = document) %>%
  mutate(id = as.integer(id))

Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

network_df <- left_join(author_df, id_topic, by = "id") 
author_topics <- network_df %>% 
  group_by(author) %>%
  summarise(topic_new = Mode(topic))
kable(head(author_topics))
author topic_new
a. adams 5
a. adeloye 1
a. ager 5
a. aldous 6
a. andrade 5
a. armstrong 7
#author_pairs <- author_df %>%
#  pairwise_count(author, id, sort = TRUE, upper = FALSE)
#names(author_pairs)[1:2] <- c("author1", "author2")
#kable(head(author_pairs))

Topic modelling with abstracts:

abstract_df <- dat %>% select(id, abstract) %>%
  filter(!is.na(abstract)) %>%
  unnest_tokens(word, abstract) %>%
  anti_join(stop_words)
## Joining, by = "word"
abstract_df %>% count(word, sort = TRUE)
## # A tibble: 9,257 x 2
##             word     n
##            <chr> <int>
##  1       climate  2000
##  2        change  1067
##  3       species   772
##  4   temperature   720
##  5 precipitation   646
##  6         model   597
##  7         water   578
##  8        forest   544
##  9          fire   498
## 10          1999   486
## # ... with 9,247 more rows
#Looks like there are some numbers... maybe get rid of those.
#Also, change everything to lower case. 
abstract_df <- abstract_df %>%
  filter(!grepl("[[:digit:]]", word)) %>%
  mutate(word = tolower(word))

#Need to add a few more stop words: 
abstract_df <- abstract_df %>%
  filter(!word %in% c("org", "http", "xhtml", "xmlns"))
  
library(widyr)
abstract_word_pairs <- abstract_df %>%
  pairwise_count(word, id, sort = TRUE, upper = FALSE)

#this is slow - only run if needed. 
# abstract_cors <- abstract_df %>% 
#   group_by(word) %>%
#   #filter(n() >= 50) %>%
#   pairwise_cor(word, id, sort = TRUE, upper = FALSE)

#Calculate tf_idf.
abstract_tf_idf <- abstract_df %>% 
  count(id, word, sort = TRUE) %>%
  ungroup() %>%
  bind_tf_idf(word, id, n)
word_counts <- abstract_df %>%
  count(id, word, sort = TRUE) %>%
  ungroup()
word_counts
## # A tibble: 79,097 x 3
##       id     word     n
##    <int>    <chr> <int>
##  1  2375  glacier    22
##  2  2444  glacier    22
##  3   615  degrees    19
##  4  2475    aspen    18
##  5  2875  species    18
##  6  3078    water    18
##  7    99 recharge    17
##  8   194  species    17
##  9   700   stream    17
## 10   821     snow    17
## # ... with 79,087 more rows
abstract_dtm <- word_counts %>%
  cast_dtm(id, word, n)

#This may take a while. 
abstract_lda <- LDA(abstract_dtm, k = 8, control = list(seed = 1234))
tidy_lda <- tidy(abstract_lda)

top_terms <- tidy_lda %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms
## # A tibble: 80 x 3
##    topic        term        beta
##    <int>       <chr>       <dbl>
##  1     1        fire 0.033224806
##  2     1      forest 0.017407021
##  3     1      carbon 0.011841713
##  4     1     forests 0.010755105
##  5     1        soil 0.009741968
##  6     1    severity 0.008870568
##  7     1      burned 0.008357221
##  8     1 disturbance 0.007976146
##  9     1    wildfire 0.007410203
## 10     1       fires 0.007398807
## # ... with 70 more rows
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  group_by(topic, term) %>%    
  arrange(desc(beta)) %>%  
  ungroup() %>%
  mutate(term = factor(paste(term, topic, sep = "__"), 
                       levels = rev(paste(term, topic, sep = "__")))) %>%
  ggplot(aes(term, beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
  labs(title = "Top 10 terms in each LDA topic",
       x = NULL, y = expression(beta)) +
  facet_wrap(~ topic, ncol = 4, scales = "free")

lda_gamma <- tidy(abstract_lda, matrix = "gamma")

ggplot(lda_gamma, aes(gamma, fill = as.factor(topic))) +
  geom_histogram(show.legend = FALSE, bins = 40) +
  facet_wrap(~ topic, ncol = 4) +
  scale_y_log10() +
  labs(title = "Distribution of probability for each topic",
       y = "Number of documents", x = expression(gamma))
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 7 rows containing missing values (geom_bar).

It doesn’t look like these are very well separated: in an ideal world where each document fits perfectly into one category, we would just have bars at the far left and right sides of each facet.

Repeat the topic-modeling with term frequency-inverse document frequency (TF-IDF) instead of word counts:

## # A tibble: 80 x 3
##    topic     term        beta
##    <int>    <chr>       <dbl>
##  1     1     fire 0.011607530
##  2     1   forest 0.005911603
##  3     1    stand 0.005449802
##  4     1 severity 0.005425651
##  5     1   burned 0.005305420
##  6     1    fires 0.005079615
##  7     1    aspen 0.004897655
##  8     1   carbon 0.004886524
##  9     1   beetle 0.004877169
## 10     1 wildfire 0.004625715
## # ... with 70 more rows

## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 11 rows containing missing values (geom_bar).

I’ve played around a little with different numbers of topics, and I don’t think the gammas are a lot better with fewer than more topics - although, selecting only four topics (for example) makes the topics a lot more intelligible to me.

Authors per publication over time

Here’s a simple thing we could have done much earlier: does the number of authors per publication change over time?

aut_pub <- author_df %>%
  group_by(id) %>% 
  count() %>%
  full_join(author_df[,c("id", "year")], by = "id") 

ggplot(aut_pub, aes(x = year, y = n)) +
  #geom_point(alpha = 0.1) +
  geom_hex() + 
  scale_fill_viridis(option = "A") +
  geom_smooth(method = "lm")

fit <- lm(n ~ year, data = aut_pub)
summary(fit)
## 
## Call:
## lm(formula = n ~ year, data = aut_pub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8413 -2.6587 -1.1110  0.8974 19.5238 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -361.43220   31.86969  -11.34   <2e-16 ***
## year           0.18258    0.01585   11.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.526 on 3137 degrees of freedom
## Multiple R-squared:  0.04058,    Adjusted R-squared:  0.04028 
## F-statistic: 132.7 on 1 and 3137 DF,  p-value: < 2.2e-16

Well, it looks like number of authors per year is increasing by about 0.18 authors per year - but there’s some funky stuff going on with the residuals of that model, so I’d be a little hesitant to trust those findings.

Frequency of common terms in abstracts over time

Take a look at how the most common terms in abstracts have changed over time.

#Add year to abstract word counts, and count number of times each word occurs in each year.

df <-  full_join(abstract_df, author_df[,c("id", "year")], by = "id") %>%
  group_by(year, word) %>% 
  count() %>% 
  arrange(year, desc(n))

#Figure out most common terms and see how they've changed over time (could also choose terms of interest in another way):
top_words <- df %>% 
  group_by(word) %>%
  summarise(total = sum(n)) %>%
  arrange(desc(total))

top <- top_words[1:10,]

plot_dat <- df %>% filter(word %in% top$word) %>%
  filter(year < 2017) %>%
  filter(year > 1980)

ggplot(plot_dat, aes(x = year, y = n, group = word, color = word)) +
  geom_line() 

If we’re interested in doing more with this, two good next steps would be to normalize the data by total number of words in the corpus for each year (the corpus gets bigger over time, so words naturally become more frequent) and to think more about what words we’re actually interested in. It might be that combinations of words are really the most interesting (bark beetle? forest carbon? stream temperature?).

Semantic Networks

with keywords

Another thing we can do is look at which keywords or abstract words act as bridging concepts by treating the keywords or terms in abstracts as nodes in a network, where the edges are papers. Let’s start with keywords for simplicity. Earlier, we already made a network graph of keywords. Another way of assessing this is by testing the centrality metrics for different keywords.

graph_obj <- graph_from_data_frame(keyword_pairs, directed = FALSE)
ec <- eigen_centrality(graph_obj)[[1]]
ec_df <- data.frame(ec) 
ec_df <- ec_df %>% mutate(name = rownames(ec_df)) %>%
  arrange(desc(ec))
names(ec_df[1]) <- "eigen_centrality"

bet <- betweenness(graph_obj) %>%
  data.frame() 
bet <- bet %>% mutate(name = rownames(bet))
names(bet)[1] <- "betweenness"

deg <- degree(graph_obj) %>% 
  data.frame()
deg <- deg %>% mutate(name = rownames(deg))
names(deg)[1] <- "degree_centrality"

keyword_stats <- full_join(ec_df, bet, by = "name") %>%
  full_join(deg, by = "name") %>%
  select(name, everything()) %>%
  arrange(desc(degree_centrality)) 

#Add the frequency of each keyword for reference. 

freq <- keyword_df %>%
  group_by(keywords) %>%
  count() %>%
  rename("name" = "keywords")
keyword_stats <- left_join(keyword_stats, freq, by = "name")

kable(keyword_stats[1:20,])
name ec betweenness degree_centrality n
climate change 1.0000000 1208146.119 1422 444
climate 0.8519273 128232.473 700 176
forests 0.7401325 39942.966 528 96
temperature 0.6805871 40075.872 466 88
global warming 0.5490619 63682.013 393 59
models 0.6147894 14087.519 364 73
ecology 0.5943023 11313.304 358 50
ecosystems 0.5983748 10623.358 351 47
effects 0.5861596 9475.889 336 38
precipitation 0.5288523 35094.003 336 65
pines 0.5275373 12387.995 322 52
snow 0.4901485 23360.509 293 48
hydrology 0.4823323 19940.503 290 53
drought 0.4627442 19488.918 290 41
woody plants 0.4961344 7955.838 286 32
national parks 0.4822818 14362.226 271 33
wildfires 0.4990651 6896.379 264 34
mountain areas 0.4731524 8254.384 261 40
watersheds 0.4673826 10536.409 257 41
mountains 0.4698888 11721.581 255 35

The highest keywords in this list should be those that are the best connected to other keywords - that act as bridging concepts. However, they may also just be the most common ones. Another way to think about this is to see which keywords rank the lowest - which are not bridging concepts?

kable(tail(keyword_stats, 20))
name ec betweenness degree_centrality n
2116 paleoclimatology 0.0021559 0 2 1
2117 alberta 0.0015828 0 2 1
2118 national forest 0.0015050 0 2 1
2119 glacial geomorphology 0.0009659 0 2 1
2120 landscape evolution 0.0009659 0 2 1
2121 monitoring trends in burn severity 0.0007862 0 2 1
2122 forest disturbances 0.0007862 0 2 1
2123 timing of spring 0.0005932 0 2 1
2124 alpine treeline ecotone 0.0000000 0 2 1
2125 abiotic treeline controls 0.0000000 0 2 1
2126 dynamic response 0.0000000 0 2 1
2127 precipitation timing 0.0000000 0 2 1
2128 bacterial and fungal soil community composition 0.0000000 0 2 1
2129 environmental and vegetation effects 0.0000000 0 2 1
2130 scale dependence 0.0000000 0 2 1
2131 soil and atmospheric drought 0.0000000 0 2 1
2132 forest and meadow soils 0.0000000 0 2 1
2133 orographic precipitation 0.0086511 0 1 1
2134 climax communities 0.0014596 0 1 1
2135 western forest insects 0.0001061 0 1 1

Maybe there’s a way to control for the frequency of keywords, if that’s something we’re concerned about.

semantic networks with abstracts

Do we find the same patterns with abstract words as with keywords?

graph_obj <- graph_from_data_frame(abstract_word_pairs, directed = FALSE)
ec <- eigen_centrality(graph_obj)[[1]]
ec_df <- data.frame(ec) 
ec_df <- ec_df %>% mutate(name = rownames(ec_df)) %>%
  arrange(desc(ec))
names(ec_df[1]) <- "eigen_centrality"

bet <- betweenness(graph_obj) %>%
  data.frame() 
bet <- bet %>% mutate(name = rownames(bet))
names(bet)[1] <- "betweenness"

deg <- degree(graph_obj) %>% 
  data.frame()
deg <- deg %>% mutate(name = rownames(deg))
names(deg)[1] <- "degree_centrality"

abstract_stats <- full_join(ec_df, bet, by = "name") %>%
  full_join(deg, by = "name") %>%
  select(name, everything()) %>%
  arrange(desc(degree_centrality)) 

#Add the frequency of each abstract word for reference. 

freq <- abstract_df %>%
  group_by(word) %>%
  count() %>%
  rename("name" = "word")
abstract_stats <- left_join(abstract_stats, freq, by = "name")

kable(abstract_stats[1:20,])
name ec betweenness degree_centrality n
climate 1.0000000 1352220.8 7542 2000
change 0.9757264 1006650.8 6955 1067
results 0.9038545 429570.1 5480 408
data 0.8784773 360950.4 5142 439
study 0.8758785 356945.9 5120 404
mountain 0.8678757 358297.8 5029 415
temperature 0.8613191 307638.5 4927 720
western 0.8583111 306690.7 4846 437
model 0.8476313 301725.0 4809 597
species 0.8463399 288901.3 4808 772
based 0.8450738 267328.7 4672 307
future 0.8461055 247893.2 4638 437
conditions 0.8370237 243106.6 4548 326
effects 0.8384379 222812.7 4510 359
water 0.8230651 239067.1 4497 578
potential 0.8292180 218751.1 4437 269
range 0.8253281 225974.9 4417 342
precipitation 0.8122574 199892.5 4302 646
forest 0.8149408 201270.7 4293 544
models 0.8086865 187872.5 4218 336

What are the least well-connected words in abstracts?

kable(tail(abstract_stats, 20))
name ec betweenness degree_centrality n
8242 deliveries 0.0180357 0 46 1
8243 modsim 0.0180357 0 46 1
8244 incapable 0.0180357 0 46 1
8245 arranged 0.0229174 0 45 1
8246 arguing 0.0177760 0 45 1
8247 surrounded 0.0177760 0 45 1
8248 annular 0.0183722 0 41 1
8249 unseasonably 0.0183722 0 41 1
8250 epitomize 0.0198688 0 39 1
8251 nw 0.0137711 0 39 1
8252 behave 0.0137711 0 39 1
8253 rclimdex 0.0180382 0 36 1
8254 evidencing 0.0180382 0 36 1
8255 monatan 0.0176755 0 34 1
8256 entirety 0.0176755 0 34 1
8257 tenures 0.0152463 0 33 2
8258 comment 0.0117125 0 31 1
8259 papers 0.0117125 0 31 1
8260 editor 0.0117125 0 31 1
8261 facets 0.0117125 0 31 1

Plot a network of abstract words: (this is words that show up together 70 or more times - totally arbitrary)

set.seed(1234)
abstract_word_pairs %>%
  filter(n >= 70) %>% 
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "springgreen4") +
  geom_node_point(size = 2) +
  geom_node_text(aes(label = name), repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void()